EDA

The beginning

bank <- data.frame(read.csv(file="bank.csv"))
# changing the data types 
bank[] <- lapply(bank, as.factor)
bank$age <- as.numeric(bank$age)
bank$balance <- as.numeric(bank$balance)
bank$day <- as.numeric(bank$day)
bank$duration <- as.numeric(bank$duration)
bank$campaign <- as.numeric(bank$campaign)
bank$previous <- as.numeric(bank$previous)
bank$pdays <- as.numeric(bank$pdays)

Check outliers

# Checking for outliers
boxplot(bank$age,ylab = 'age') 

boxplot(bank$balance,ylab = 'balance')

boxplot(bank$day, ylab = 'day')

boxplot(bank$duration, ylab = 'duration')

boxplot(bank$campaign, ylab = 'campaign')

hist(bank$previous, ylab = 'previous')

hist(bank$pdays, ylab = 'pdays')

bank2 = outlierKD2(bank, age, rm=T, histogram = F)

QQ-plot

qqnorm(bank2$age, main = "Age: Normal Q-Q Plot") + theme(plot.title = element_text(hjust = 0.5, face = "bold"))
qqline(bank2$age, col= "steelblue", lwd = 2)

qqnorm(bank2$balance)
qqline(bank2$balance, col= "steelblue", lwd = 2)

qqnorm(bank2$day)
qqline(bank2$day, col= "steelblue", lwd = 2)

qqnorm(bank2$duration)
qqline(bank2$duration, col= "steelblue", lwd = 2)

qqnorm(bank2$campaign)
qqline(bank2$campaign, col= "steelblue", lwd = 2)

qqnorm(bank2$previous)
qqline(bank2$previous, col= "steelblue", lwd = 2)

qqnorm(bank2$pdays)
qqline(bank2$pdays, col= "steelblue", lwd = 2)

# lets check yes vs no: subscribed a deposity yes/no
ggplot(bank2, aes( x = y, fill = y)) + geom_bar(colour = "black") + labs(x = "Response", y = "Count", fill = "Response") +
  ggtitle("Subcribed a Deposit: YES/NO") + theme(plot.title = element_text(hjust = 0.5, face = "bold"))

prop.table(table(bank2$y))
# way more no than yes - not balanced 
# will have to correct this

Visualizations

### JOBS vs SUBSCRIBED DEPOSIT

ggplot(bank2, aes(x = job, fill = y)) + geom_bar(colour = "black") + labs( x = "", y = "Count", fill = "Response") +
  ggtitle("Distribution of Job Type") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15, axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))                                                                                                                                                   

### MARTIAL STATUS vs SUBSCRIBED DEPOSIT
ggplot(bank2, aes(x = marital, fill = y)) + geom_bar(colour = "black") + labs( x = "Marital Status", y = "Count", fill = "Response") +
  ggtitle("Distribution of Marital Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15) 

### EDUCATION vs SUBSCRIBED DEPOSIT
ggplot(bank2, aes(x = education, fill = y)) + geom_bar(colour = "black") + labs( x = "Education", y = "Count", fill = "Response") +
  ggtitle("Distribution of Education Level") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15) 

###  DEFAULT STATUS
ggplot(bank2, aes(x = default, fill = y)) + geom_bar(colour = "black") + labs( x = "Default Status", y = "Count", fill = "Response") +
  ggtitle("Distribution of Default Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15) 

### DISTRIBUTION OF AGE
ggplot(bank2, aes(x = age, fill = y)) + geom_histogram(colour = "black", bins = 30) + labs( x = "Distribution of Age", y = "Count", fill = "Response") +
  ggtitle("Distribution of Marital Status") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15) 

ggplot(bank, aes(x = balance, fill = y)) + geom_histogram(colour = "black") + labs( x = "Balance", y = "Count", fill = "Response") +
  ggtitle("Distribution of Balance") + theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 7/15) 

Data Partition

library(dominanceanalysis)
library(caTools)
set.seed(123)
sample <- sample.split(bank2$y, SplitRatio = 0.7)
train <- subset(bank2, sample == TRUE)
test  <- subset(bank2, sample == FALSE)

Model Construction

Without balancing

mybanklog <- glm(y ~ age+marital+education+default+balance+housing+loan+contact+duration+campaign+pdays+previous+poutcome, data = train, family = "binomial")
summary(mybanklog)
mybanklog2 <- glm(y ~ balance+housing+loan+duration+campaign, data = train, family = "binomial")
summary(mybanklog2)
library(pscl)
pR2(mybanklog2)

Balance the dataset

#install.packages("ROSE")
library(ROSE)
bank_balanced_over <- ovun.sample(y ~., data = train, method = "over", N = 5600)$data
table(bank_balanced_over$y)
# using balanced dataset
mybanklog2b <- glm(y ~ balance+housing+loan+duration+campaign, data = bank_balanced_over, family = "binomial")
summary(mybanklog2b)

Training Set’s Predicted Score

library(ggthemes)
# prediction
train$prediction <- predict(mybanklog2b, newdata = train, type = "response" )
test$prediction  <- predict(mybanklog2b, newdata = test , type = "response" )

# distribution of the prediction score grouped by known outcome
ggplot( train, aes( prediction, color = as.factor(y) ) ) + 
geom_density( size = 1 ) +
ggtitle( "Training Set's Predicted Score" ) + 
scale_color_economist( name = "data", labels = c( "negative", "positive" ) ) + 
theme_economist()

Cost of our models

Unbalanced model

Confusion matrix at 0.5

library(pROC)
test$banklogpred <- predict(mybanklog2, newdata = test, type = "response")
library(data.table)
cutoff <- 0.5
predict <- test$banklogpred
actual  <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
                      ifelse( predict >= cutoff & actual == 0, "FP", 
                      ifelse( predict <  cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) + 
            geom_violin( fill = "white", color = NA ) +
            geom_jitter( shape = 1 ) + 
            geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) + 
            scale_y_continuous( limits = c( 0, 1 ) ) + 
            scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend 
            guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows  
            ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot

table(result$type)

Find the best cutoff and minimal cost

library(ROCR)
cost.fp <- 100
cost.fn <- 200
# At cutoff = 0.5, cost = 200* 127 + 100 * 33 = 31800
pred <- prediction(test$banklogpred, test$y)
perf <- performance(pred,"tpr","fpr")
roc_dt <- data.frame( fpr = perf@x.values[[1]], tpr = perf@y.values[[1]] )
cost <- perf@x.values[[1]] * cost.fp * sum( test$y == 0 ) + 
            ( 1 - perf@y.values[[1]] ) * cost.fn * sum( test$y == 1 )
cost_dt <- data.frame( cutoff = pred@cutoffs[[1]], cost = cost )
best_index  <- which.min(cost)
best_cost   <- cost_dt[ best_index, "cost" ]
best_tpr    <- roc_dt[ best_index, "tpr" ]
best_fpr    <- roc_dt[ best_index, "fpr" ]
best_cutoff <- pred@cutoffs[[1]][ best_index ]
best_cost
best_cutoff

ROC plot & Cost plot

normalize <- function(v) ( v - min(v) ) / diff( range(v) )
col_ramp <- colorRampPalette( c( "green", "orange", "red", "black" ) )(100)   
col_by_cost <- col_ramp[ ceiling( normalize(cost) * 99 ) + 1 ]
roc_plot <- ggplot( roc_dt, aes( fpr, tpr ) ) + 
  geom_line( color = rgb( 0, 0, 1, alpha = 0.3 ) ) +
  geom_point( color = col_by_cost, size = 4, alpha = 0.2 ) + 
  geom_segment( aes( x = 0, y = 0, xend = 1, yend = 1 ), alpha = 0.8, color = "royalblue" ) + 
  labs( title = "ROC", x = "False Postive Rate", y = "True Positive Rate" ) +
  geom_hline( yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) +
  geom_vline( xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) 
roc_plot

cost_plot <- ggplot( cost_dt, aes( cutoff, cost ) ) +
                 geom_line( color = "blue", alpha = 0.5 ) +
                 geom_point( color = col_by_cost, size = 4, alpha = 0.5 ) +
                 ggtitle( "Cost" ) +
                 scale_y_continuous( labels = comma ) +
                 geom_vline( xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) 
cost_plot

reconstruct the confusion matrix at best_cutoff

cutoff <- 0.199
predict <- test$banklogpred
actual  <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
                      ifelse( predict >= cutoff & actual == 0, "FP", 
                      ifelse( predict <  cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) + 
            geom_violin( fill = "white", color = NA ) +
            geom_jitter( shape = 1 ) + 
            geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) + 
            scale_y_continuous( limits = c( 0, 1 ) ) + 
            scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend 
            guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows  
            ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot

table(result$type)

Balanced model

Confusion matrix at cutoff = 0.5

cutoff <- 0.5
predict <- test$prediction
actual  <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
                      ifelse( predict >= cutoff & actual == 0, "FP", 
                      ifelse( predict <  cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) + 
            geom_violin( fill = "white", color = NA ) +
            geom_jitter( shape = 1 ) + 
            geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) + 
            scale_y_continuous( limits = c( 0, 1 ) ) + 
            scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend 
            guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows  
            ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot

table(result$type)

Find the best cutoff and minimal cost

library(ROCR)
cost.fp <- 100
cost.fn <- 200
# At cutoff = 0.5, cost = 200* 41 + 100 * 236 = 31800
pred <- prediction(test$prediction, test$y)
perf <- performance(pred,"tpr","fpr")
roc_dt <- data.frame( fpr = perf@x.values[[1]], tpr = perf@y.values[[1]] )
cost <- perf@x.values[[1]] * cost.fp * sum( test$y == 0 ) + 
            ( 1 - perf@y.values[[1]] ) * cost.fn * sum( test$y == 1 )
cost_dt <- data.frame( cutoff = pred@cutoffs[[1]], cost = cost )
best_index  <- which.min(cost)
best_cost   <- cost_dt[ best_index, "cost" ]
best_tpr    <- roc_dt[ best_index, "tpr" ]
best_fpr    <- roc_dt[ best_index, "fpr" ]
best_cutoff <- pred@cutoffs[[1]][ best_index ]
best_cost
best_cutoff

ROC plot & Cost plot

normalize <- function(v) ( v - min(v) ) / diff( range(v) )
col_ramp <- colorRampPalette( c( "green", "orange", "red", "black" ) )(100)   
col_by_cost <- col_ramp[ ceiling( normalize(cost) * 99 ) + 1 ]
roc_plot <- ggplot( roc_dt, aes( fpr, tpr ) ) + 
  geom_line( color = rgb( 0, 0, 1, alpha = 0.3 ) ) +
  geom_point( color = col_by_cost, size = 4, alpha = 0.2 ) + 
  geom_segment( aes( x = 0, y = 0, xend = 1, yend = 1 ), alpha = 0.8, color = "royalblue" ) + 
  labs( title = "ROC", x = "False Postive Rate", y = "True Positive Rate" ) +
  geom_hline( yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) +
  geom_vline( xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) 
roc_plot

cost_plot <- ggplot( cost_dt, aes( cutoff, cost ) ) +
                 geom_line( color = "blue", alpha = 0.5 ) +
                 geom_point( color = col_by_cost, size = 4, alpha = 0.5 ) +
                 ggtitle( "Cost" ) +
                 scale_y_continuous( labels = comma ) +
                 geom_vline( xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4" ) 
cost_plot

reconstruct the confusion matrix at best_cutoff

cutoff <- 0.703
predict <- test$prediction
actual  <- relevel( as.factor( test$y ), "1" )
result <- data.table( actual = actual, predict = predict )
result[ , type := ifelse( predict >= cutoff & actual == 1, "TP",
                      ifelse( predict >= cutoff & actual == 0, "FP", 
                      ifelse( predict <  cutoff & actual == 1, "FN", "TN" ) ) ) %>% as.factor() ]
plot <- ggplot( result, aes( actual, predict, color = type ) ) + 
            geom_violin( fill = "white", color = NA ) +
            geom_jitter( shape = 1 ) + 
            geom_hline( yintercept = cutoff, color = "blue", alpha = 0.6 ) + 
            scale_y_continuous( limits = c( 0, 1 ) ) + 
            scale_color_discrete( breaks = c( "TP", "FN", "FP", "TN" ) ) + # ordering of the legend 
            guides( col = guide_legend( nrow = 2 ) ) + # adjust the legend to have two rows  
            ggtitle( sprintf( "Confusion Matrix with Cutoff at %.3f", cutoff ) )
plot

table(result$type)

Interpretation and Reasoning

expcoeff = exp(coef(mybanklog2b))
# expcoeff
xkabledply( as.table(expcoeff), title = "Exponential of coefficients" )

McFadden

Look at the McFadden value.

library(pscl)
pR2(mybanklog2b)

With the McFadden value of 0.284, about 28.4% of the variations in y is explained by the explanatory variables in the model.

Predictors importance

Null deviance

Interpret the table of null deviance.

anova(mybanklog2b, test="Chisq")

In our case, we can see that adding duration alone reduces the deviance drastically (i.e., from 7507 to 5665). The small deviance value of slope indicates this variable does not add much to the model, meaning that almost the same amount of variation is explained when this variable is added.

library(dominanceanalysis)
dabank <- dominanceAnalysis(mybanklog2b)

Dominance analysis

Complete dominance matrix

dominanceMatrix(dabank, type="complete",fit.functions = "r2.m", ordered=TRUE)

This complete dominance matrix summarizes the relation between each pair of predictors. If the value between two predictors is 1, the predictor under the first column completely dominates the other predictor of the pair. If the value is 0, the predictor under the first column is completely dominated by the other predictor of the pair. Lastly, if the value is 0.5, complete dominance could not be established between the pair.

General dominance

averageContribution(dabank, fit.functions = "r2.m")
plot(dabank, which.graph ="general",fit.function = "r2.m")

To determine general dominance, we compute the mean of each predictor’s conditional measures. We conclude that duration has the highest value (0.254) and generally dominates all other predictors.

Reference

Useful functions when working with logistic regression https://github.com/ethen8181/machine-learning/blob/master/unbalanced/unbalanced_code/unbalanced_functions.R

Exploring predictors’ importance in binomial logistic regressions https://cran.r-project.org/web/packages/dominanceanalysis/vignettes/da-logistic-regression.html